# 1 select pairs for trading
# 2 fit marginal distribution for returns for the securities chosen
# 3 make marginal returns uniformly distributed using the probability integral transform
# 4 fit copula to transformed returns
# 5 use fitted copula to calculate conditional probabilities
# 6 enter and exit long / short positions when conditional probabilities fall outside confidence bands
import pandas as pd
import pandas_datareader.data as web
import seaborn
from statsmodels.tsa.stattools import coint, adfuller
import numpy as np
import datetime as dt
import sqlite3
import time
import config
import statsmodels.tsa.stattools as ts
from statsmodels.tsa.vector_ar.vecm import coint_johansen
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()
import matplotlib.pyplot as plt
%matplotlib inline
from copula import *
import plotly.express as px
import ndtest # bivariate Kolmogorov-Smirnov
import warnings
warnings.filterwarnings("ignore")
class Trading:
    """Downloads a universe of ETF prices from Yahoo Finance, caches them in a
    local SQLite database, and exposes helpers that reshape the cached data for
    the copula pairs-trading strategy."""

    def __init__(self, condition, start, end):
        """
        Inputs: condition -- 1 to use the explicit start/end bounds verbatim,
                anything else for a ~35-year lookback ending now.
                start, end -- Yahoo epoch bounds (only used when condition == 1).
        """
        if condition == 1:
            self.period1 = start
            self.period2 = end
        else:
            # default window: ~35 years back from today, as unix epochs for Yahoo
            self.period1 = int(time.mktime((dt.datetime.now() - dt.timedelta(365 * 35)).timetuple()))
            self.period2 = int(time.mktime(dt.datetime.now().timetuple()))
        self.fred_start = str(dt.datetime.now() - dt.timedelta(365 * 35)).split()[0]
        self.fred_end = dt.datetime.now()
        self.interval = '1d'
        # BUG FIX: "SHY" and "SPLB" were missing a separating comma, so implicit
        # string concatenation produced the bogus ticker "SHYSPLB" (dropping both
        # real tickers); the duplicated "SPTI" entry is also removed.
        self.ticker_list = ["^GSPC","SPXL","XLK","XLU","XLI","XLY","XLP","XLB","XLM","XLV","XLE",
                            "AGG","JNK","LQD","CWB","BKLN","TIP","TLT","MUB","MBB","XLCPR","XLRE","USO",
                            "GDX","GDXJ","QQQ","KBE","KRE","XSB","XME","XHB","KCE","XRT","XOP","XES",
                            "XSW","XAR","XITK","XBI","KIE","XHE","XHS","DIA","SPAB","SPSB","SPIB","SHY",
                            "SPLB","SPBO","SPTS","SPTI","SPTL","SPMB","SPHY","SPIP","HYG","IEF",
                            "TMF","TYD","TYO"]

    def yahoo_universe(self):
        """Download daily OHLCV history for every ticker in self.ticker_list.
        Returns (long-form DataFrame, list of per-ticker frames, failed tickers)."""
        web3 = []
        failed = []
        for security in self.ticker_list:
            try:
                query_string = (f'https://query1.finance.yahoo.com/v7/finance/download/{security}'
                                f'?period1={self.period1}&period2={self.period2}'
                                f'&interval={self.interval}&events=history&includeAdjustedClose=true')
                df = pd.read_csv(query_string).set_index("Date")
                df['ticker'] = security
                df.columns = ['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'ticker']
                web3.append(df)
            except Exception:  # narrowed from bare except; keep best-effort semantics
                failed.append(security)
        if not web3:
            # robustness: pd.concat([]) raises ValueError when every download fails
            return pd.DataFrame(), web3, failed
        df_final = pd.concat(web3).reset_index()
        return df_final, web3, failed

    def trading_sql(self, data_file):
        """Replace the consensusinvestable table in the local 'tradable_universe' DB."""
        try:
            conn = sqlite3.connect('tradable_universe')
            data_file.to_sql("consensusinvestable", conn, if_exists='replace', index=False)
            conn.commit()
            conn.close()
            print("Updating the sql table was a success")
        except Exception:
            print("Was unsuccessful in uploading the dataframe into the datatable. ")

    def to_trading_sql(self, data):
        """
        Inputs: dataframe you want to upload
        Outputs: saves SQL table to the investable database (path from config.db_file)
        """
        try:
            connection = sqlite3.connect(config.db_file)
            data.to_sql("consensusinvestable", connection, if_exists='replace', index=False)
            connection.commit()
            connection.close()
            print("Uploading dataframe to database worked successfully.")
        except Exception:
            print("Was unable to save the dataframe into the investable datatable.")

    def load_sql(self):
        """Read the full consensusinvestable table back. Returns None on failure."""
        df = None  # BUG FIX: previously unbound -> NameError at `return df` when the read failed
        try:
            conn = sqlite3.connect('tradable_universe')
            df = pd.read_sql_query("SELECT * from consensusinvestable", conn)
            conn.close()
        except Exception:
            print("Could not retrieve the datatable. ")
        return df

    def ticker_filter(self, ticker, final):
        """Rows of `final` that belong to a single ticker."""
        return final[final['ticker'] == ticker]

    def dataframe(self, tickers, final, start, end):
        """
        Inputs: The tickers you want to filter by and a raw unfiltered data query
        from the trading database.
        Outputs: A dataframe with the columns being the names of the selected
        tickers and the values being the adjusted close prices, sliced to [start:end].
        """
        # isin() replaces the per-ticker filter-and-concat loop; pivot_table output
        # is identical because it sorts and mean-aggregates regardless of row order
        main = final[final['ticker'].isin(tickers)]
        df = main.pivot_table(index=main.index, columns="ticker", values='Adj Close')
        return df[start:end]
# Bootstrap: download/load the universe, then pivot to a wide price frame.
main = Trading(0, "2018-01-01", "2020-01-01")  # set first input to one if you want exact date inputs
# don't have to run this every time -- main.trading_sql(df) pushes the download
# into the sql datatable => consensusinvestable
df, df1, df2 = main.yahoo_universe()
final = main.load_sql().set_index("Date")
# BUG FIX: "SHY" and "SPLB" were missing a comma; implicit string concatenation
# silently produced the nonexistent ticker "SHYSPLB" and dropped both real ones.
tickers = ["^GSPC","SPXL","XLK","XLU","XLI","XLY","XLP","XLB","XLV","XLE",
           "AGG","JNK","LQD","CWB","BKLN","TIP","TLT","MUB","MBB","USO",
           "GDX","GDXJ","QQQ","KBE","KRE","DIA","SPAB","SPSB","SPIB","SHY",
           "SPLB","SPBO","SPTS","SPTI","SPTL","SPMB","SPHY","SPIP","HYG","IEF",
           "TMF","TYD","TYO"]
start = '2000-01-01'
end = '2023-01-18'
df = main.dataframe(tickers, final, start, end)
df = df.dropna()
df
| ticker | AGG | BKLN | CWB | DIA | GDX | GDXJ | HYG | IEF | JNK | KBE | ... | USO | XLB | XLE | XLI | XLK | XLP | XLU | XLV | XLY | ^GSPC |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Date | |||||||||||||||||||||
| 2012-06-19 | 85.995331 | 15.456649 | 23.818144 | 101.705917 | 43.258907 | 72.617805 | 51.459663 | 89.747704 | 63.863827 | 17.475275 | ... | 253.440002 | 28.407240 | 44.965324 | 28.944820 | 24.637094 | 25.962149 | 25.844496 | 31.679304 | 38.621426 | 1357.979980 |
| 2012-06-20 | 85.863953 | 15.475798 | 23.792805 | 101.626503 | 42.886936 | 71.897118 | 51.659920 | 89.564964 | 64.076920 | 17.523483 | ... | 243.919998 | 28.246611 | 44.788113 | 28.789946 | 24.679869 | 25.826250 | 25.592970 | 31.595167 | 38.577660 | 1355.689941 |
| 2012-06-21 | 85.979904 | 15.418324 | 23.551884 | 99.625084 | 40.664108 | 66.337532 | 51.236561 | 89.755989 | 63.716236 | 17.113712 | ... | 235.679993 | 27.322996 | 42.954594 | 28.227514 | 24.098166 | 25.524277 | 25.327467 | 31.157635 | 37.737495 | 1325.510010 |
| 2012-06-22 | 85.856209 | 15.463034 | 23.646980 | 100.252518 | 40.446358 | 66.783669 | 51.585541 | 89.365616 | 63.994984 | 17.346724 | ... | 240.800003 | 27.467567 | 43.240864 | 28.325333 | 24.363346 | 25.599773 | 25.334442 | 31.485794 | 37.903782 | 1335.020020 |
| 2012-06-25 | 86.126801 | 15.456649 | 23.469479 | 99.085045 | 40.972576 | 66.165939 | 51.505474 | 89.855736 | 63.962147 | 16.953018 | ... | 238.399994 | 27.122211 | 42.272999 | 27.803654 | 23.867186 | 25.441238 | 25.285536 | 31.098736 | 37.317425 | 1313.719971 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2023-01-11 | 99.326591 | 20.818163 | 66.197830 | 338.916870 | 31.570000 | 39.099998 | 75.842575 | 98.429214 | 92.903275 | 46.700001 | ... | 68.050003 | 82.970001 | 88.139999 | 101.980003 | 129.160004 | 74.980003 | 72.080002 | 135.279999 | 138.169998 | 3969.610107 |
| 2023-01-12 | 100.044830 | 20.887327 | 66.647408 | 341.151306 | 32.180000 | 39.900002 | 76.270508 | 99.307335 | 93.400719 | 47.150002 | ... | 68.610001 | 83.279999 | 89.820000 | 102.580002 | 130.119995 | 74.389999 | 71.589996 | 134.850006 | 138.399994 | 3983.169922 |
| 2023-01-13 | 99.675728 | 20.877447 | 67.007065 | 342.188751 | 32.650002 | 40.400002 | 76.270508 | 98.778465 | 93.480316 | 47.270000 | ... | 70.050003 | 83.790001 | 89.949997 | 102.459999 | 130.490005 | 74.739998 | 71.330002 | 135.449997 | 139.699997 | 3999.090088 |
| 2023-01-17 | 99.496170 | 20.897207 | 67.216873 | 338.318359 | 31.520000 | 39.099998 | 76.011757 | 98.469131 | 93.211693 | 47.090000 | ... | 70.860001 | 82.949997 | 90.139999 | 101.589996 | 131.080002 | 74.790001 | 71.230003 | 134.820007 | 139.800003 | 3990.969971 |
| 2023-01-18 | 100.483757 | 20.887327 | 67.017059 | 332.143707 | 31.209999 | 38.560001 | 76.101326 | 99.806267 | 93.301231 | 45.700001 | ... | 69.510002 | 81.779999 | 88.489998 | 99.680000 | 129.389999 | 72.750000 | 69.519997 | 132.919998 | 137.979996 | 3928.860107 |
2663 rows × 41 columns
### this kendall correlation coefficient is helpful in fitting joint probability distributions to copulas
# could also make this for one year preceding training and 1 year after testing
# base longs off Composite Macro Indicator
### this kendall correlation coefficient is helpful in fitting joint probability distributions to copulas
def kendall(df, train_start, train_end, test_start, test_end):
    """
    Compute pairwise Kendall tau's on training-window log-returns.

    Inputs: df -- wide price DataFrame indexed by date string (df.set_index("Date"));
            train_start/train_end/test_start/test_end -- date strings delimiting
            the training and testing windows.
    Outputs: (results, returns_train, returns_test, prices_train, prices_test)
             where results holds one 'tau' per unordered column pair, indexed 's1-s2'.
    """
    returns = np.log(df).diff().dropna()  # daily log-returns
    prices_train = df[train_start:train_end]
    prices_test = df[test_start:test_end]
    returns_train = returns[train_start:train_end]
    returns_test = returns[test_start:test_end]
    results = pd.DataFrame(columns=['tau'])
    # iterate unordered pairs directly instead of scanning results.index for the
    # reversed label on every (s1, s2) combination (was O(n^2) membership tests)
    cols = list(returns_train.columns)
    for i, s1 in enumerate(cols):
        for s2 in cols[i + 1:]:
            results.loc[f'{s1}-{s2}'] = stats.kendalltau(returns_train[s1], returns_train[s2])[0]
    return results, returns_train, returns_test, prices_train, prices_test
# Run the Kendall screen: train on 2015, test on 2016.
# (Removed the commented-out `kendall(df)[k]` examples -- calling kendall without
# the four date arguments raises a TypeError, and re-calling it per output would
# repeat all the work; a single call unpacks everything.)
results, returns_train, returns_test, prices_train, prices_test = kendall(
    df, "2015-01-01", "2015-12-31", "2016-01-01", "2016-12-31")
# show the pairs whose returns are most correlated in terms of kendall's tau
results.sort_values('tau', ascending=False).head(50)
| tau | |
|---|---|
| TLT-TMF | 0.976237 |
| SPXL-^GSPC | 0.968444 |
| SPTL-TLT | 0.941973 |
| SPTL-TMF | 0.938861 |
| KBE-KRE | 0.907918 |
| DIA-SPXL | 0.878075 |
| DIA-^GSPC | 0.876621 |
| SPIP-TIP | 0.846381 |
| QQQ-XLK | 0.824556 |
| AGG-IEF | 0.814838 |
| GDX-GDXJ | 0.812161 |
| HYG-JNK | 0.810866 |
| IEF-SPAB | 0.798532 |
| XLI-^GSPC | 0.793189 |
| AGG-SPAB | 0.791474 |
| SPXL-XLI | 0.790596 |
| DIA-XLI | 0.788003 |
| QQQ-SPXL | 0.786189 |
| IEF-TMF | 0.784987 |
| IEF-SPTL | 0.783918 |
| IEF-TLT | 0.783791 |
| QQQ-^GSPC | 0.778790 |
| SPXL-XLK | 0.758850 |
| XLK-^GSPC | 0.755245 |
| SPXL-XLY | 0.743063 |
| XLY-^GSPC | 0.741356 |
| SPAB-SPTL | 0.740250 |
| SPAB-TLT | 0.737180 |
| SPAB-TMF | 0.736006 |
| AGG-LQD | 0.733543 |
| AGG-TLT | 0.733124 |
| AGG-TMF | 0.732804 |
| AGG-SPTL | 0.732618 |
| DIA-XLK | 0.730392 |
| IEF-LQD | 0.728052 |
| DIA-QQQ | 0.727756 |
| CWB-^GSPC | 0.727460 |
| DIA-XLY | 0.726494 |
| CWB-SPXL | 0.720881 |
| LQD-TLT | 0.712370 |
| LQD-SPAB | 0.711305 |
| LQD-SPTL | 0.710473 |
| LQD-TMF | 0.709965 |
| IEF-SPTI | 0.692326 |
| CWB-QQQ | 0.687922 |
| QQQ-XLY | 0.687347 |
| SPXL-XLV | 0.681591 |
| XLV-^GSPC | 0.677481 |
| CWB-DIA | 0.674448 |
| IEF-MBB | 0.674301 |
def parse_pair(pair):
    """Split an 'S1-S2' pair label at its first dash into (s1, s2)."""
    dash = pair.find('-')
    return pair[:dash], pair[dash + 1:]
def show_pairs(results_df=None, max_pairs=50):
    """
    Greedily select the highest-tau pairs such that no security appears in more
    than one pair.

    Inputs: results_df -- DataFrame with a 'tau' column indexed by 'S1-S2' labels;
            defaults to the module-level `results` (backward compatible with the
            old zero-argument call).
            max_pairs -- stop once this many pairs have been selected (the old
            hard-coded 50; a stale comment claimed 20).
    Outputs: (selected_stocks, selected_pairs).
    """
    if results_df is None:
        results_df = results  # fall back to the module-level kendall results
    selected_stocks = []
    selected_pairs = []
    for pair in results_df.sort_values(by='tau', ascending=False).index:
        # inline split at the first dash (same behavior as parse_pair)
        dash = pair.find('-')
        s1, s2 = pair[:dash], pair[dash + 1:]
        if (s1 not in selected_stocks) and (s2 not in selected_stocks):
            selected_stocks.append(s1)
            selected_stocks.append(s2)
            selected_pairs.append(pair)
        if len(selected_pairs) == max_pairs:
            break
    return selected_stocks, selected_pairs
# One call returns both lists (previously show_pairs() ran twice, doubling the work).
selected_stocks, selected_pairs = show_pairs()
selected_pairs
['TLT-TMF', 'SPXL-^GSPC', 'KBE-KRE', 'SPIP-TIP', 'QQQ-XLK', 'AGG-IEF', 'GDX-GDXJ', 'HYG-JNK', 'DIA-XLI', 'SPAB-SPTL', 'LQD-SPIB', 'CWB-XLY', 'MBB-SPTI', 'XLB-XLE', 'XLP-XLV', 'MUB-TYD', 'SPBO-SPTS', 'BKLN-USO', 'SPMB-SPSB', 'SPHY-XLU']
def marginals(selcted_stocks):
marginals_df = pd.DataFrame(index=selected_stocks, columns=['Distribution', 'AIC', 'BIC', 'KS_pvalue'])
for stock in selected_stocks:
data = returns_train[stock]
dists = ['Normal', "Student's t", 'Logistic', 'Extreme']
best_aic = np.inf # selection process is based on the best AIC
for dist,name in zip([stats.norm, stats.t, stats.genlogistic, stats.genextreme], dists):
params = dist.fit(data)
dist_fit = dist(*params)
log_like = np.log(dist_fit.pdf(data)).sum()
aic = 2*len(params) - 2 * log_like
if aic<best_aic:
best_dist = name
best_aic = aic # selection process is based on the best AIC
best_bic = len(params) * np.log(len(data)) - 2 * log_like
ks_pval = stats.kstest(data, dist_fit.cdf, N=100)[1]
marginals_df.loc[stock] = [best_dist, best_aic, best_bic, ks_pval]
### P-values were quite large which is a good thing meaning we can't reject the null hypothesis that the distributions are the same
### Low KS-Test P-values means that the parametric distributions are not a good fit for the data
return marginals_df
### keep only securities whose best-fitting marginal passes the KS goodness-of-fit test
marginals_df = marginals(selected_stocks)
good_fit = marginals_df['KS_pvalue'] > 0.10
new_marg = marginals_df.loc[good_fit]
df = df[new_marg.index]
marginals_df
| Distribution | AIC | BIC | KS_pvalue | |
|---|---|---|---|---|
| TLT | Normal | -1610.760654 | -1603.701795 | 0.586381 |
| TMF | Normal | -1062.530229 | -1055.471371 | 0.419009 |
| SPXL | Student's t | -1077.419825 | -1066.831537 | 0.679778 |
| ^GSPC | Student's t | -1627.912715 | -1617.324427 | 0.674828 |
| KBE | Logistic | -1473.359466 | -1462.771179 | 0.999844 |
| KRE | Student's t | -1458.679084 | -1448.090797 | 0.968607 |
| SPIP | Normal | -2078.972581 | -2071.913723 | 0.750715 |
| TIP | Student's t | -2130.657051 | -2120.068764 | 0.93879 |
| QQQ | Student's t | -1564.853114 | -1554.264826 | 0.921391 |
| XLK | Student's t | -1562.898441 | -1552.310154 | 0.972787 |
| AGG | Normal | -2323.835803 | -2316.776944 | 0.666722 |
| IEF | Normal | -2044.583808 | -2037.524949 | 0.541826 |
| GDX | Student's t | -1074.754705 | -1064.166417 | 0.820958 |
| GDXJ | Normal | -1051.351878 | -1044.293019 | 0.908549 |
| HYG | Student's t | -2053.086546 | -2042.498258 | 0.965974 |
| JNK | Logistic | -2094.320008 | -2083.731721 | 0.688192 |
| DIA | Logistic | -1626.772388 | -1616.184101 | 0.754299 |
| XLI | Logistic | -1600.002338 | -1589.414051 | 0.84609 |
| SPAB | Normal | -2320.441858 | -2313.383 | 0.291949 |
| SPTL | Normal | -1644.395013 | -1637.336155 | 0.651338 |
| LQD | Logistic | -2096.407575 | -2085.819287 | 0.764367 |
| SPIB | Logistic | -2384.139415 | -2373.551128 | 0.413805 |
| CWB | Logistic | -1825.223375 | -1814.635087 | 0.320368 |
| XLY | Logistic | -1599.074169 | -1588.485882 | 0.965182 |
| MBB | Logistic | -2502.507265 | -2491.918977 | 0.932075 |
| SPTI | Logistic | -2466.017288 | -2455.429001 | 0.361394 |
| XLB | Logistic | -1530.76709 | -1520.178803 | 0.983511 |
| XLE | Logistic | -1381.761192 | -1371.172905 | 0.748201 |
| XLP | Logistic | -1678.533174 | -1667.944887 | 0.850258 |
| XLV | Logistic | -1541.028177 | -1530.439889 | 0.996379 |
| MUB | Logistic | -2470.483304 | -2459.895017 | 0.943945 |
| TYD | Logistic | -1474.445374 | -1463.857087 | 0.071785 |
| SPBO | Logistic | -2089.800962 | -2079.212675 | 0.728049 |
| SPTS | Student's t | -2548.103569 | -2537.515281 | 0.142942 |
| BKLN | Logistic | -2403.252032 | -2392.663744 | 0.149573 |
| USO | Logistic | -1111.298255 | -1100.709968 | 0.826598 |
| SPMB | Normal | -2334.849811 | -2327.790953 | 0.529391 |
| SPSB | Normal | -2845.80951 | -2838.750652 | 0.077819 |
| SPHY | Logistic | -2016.368063 | -2005.779776 | 0.58341 |
| XLU | Logistic | -1579.001328 | -1568.413041 | 0.975958 |
def copulas(selected_pairs, returns_train):
    """
    Fit candidate Archimedean copulas to each pair's PIT-transformed training
    returns and keep the best one by AIC.

    Inputs: selected_pairs -- 'S1-S2' labels; returns_train -- training returns.
    Outputs: DataFrame per pair with the winning copula name, its dependence
             parameter, AIC/BIC, and a bivariate KS p-value (ndtest.ks2d2s).
    """
    copulas_df = pd.DataFrame(index=selected_pairs, columns=['copula', 'parameter', 'aic', 'bic', 'KS_pvalue'])
    for pair in selected_pairs:
        s1, s2 = parse_pair(pair)
        # fit Student's t marginals, then probability-integral-transform to uniforms
        dist_s1 = stats.t(*stats.t.fit(returns_train[s1]))
        dist_s2 = stats.t(*stats.t.fit(returns_train[s2]))
        u = dist_s1.cdf(returns_train[s1])
        v = dist_s2.cdf(returns_train[s2])
        best_aic = np.inf
        best_copula = None
        # GaussianCopula() deliberately excluded
        for copula in [ClaytonCopula(), GumbelCopula(), FrankCopula(), JoeCopula()]:
            copula.fit(u, v)
            L = copula.log_likelihood(u, v)
            aic = 2 * copula.num_params - 2 * L
            if aic < best_aic:
                best_aic = aic
                best_bic = copula.num_params * np.log(len(u)) - 2 * L
                best_copula = copula
        # BUG FIX: the KS p-value and reported parameter were previously taken
        # from whichever candidate was fitted LAST in the loop (JoeCopula),
        # not from the AIC-best copula.
        smp = best_copula.sample(size=len(u))  # sample from the fitted best copula
        ks_pval = ndtest.ks2d2s(u, v, smp[:, 0], smp[:, 1])
        if isinstance(best_copula, ArchimedeanCopula):
            best_param = best_copula.alpha
        else:
            best_param = best_copula.rho
        copulas_df.loc[pair] = [best_copula.name, best_param, best_aic, best_bic, ks_pval]
    return copulas_df
# NOTE(review): the original comment had the KS interpretation backwards — a LOW
# p-value (below ~0.05) means we REJECT the hypothesis that the copula sample and
# the transformed data share a distribution, i.e. the copula is a POOR fit.
copulas_df = copulas(selected_pairs, returns_train)
copulas_df
| copula | parameter | aic | bic | KS_pvalue | |
|---|---|---|---|---|---|
| TLT-TMF | Gumbel | 19.999993 | -1470.560308 | -1467.030879 | 0.841738 |
| SPXL-^GSPC | Gumbel | 19.999993 | -1431.466213 | -1427.936784 | 0.33278 |
| KBE-KRE | Gumbel | 9.287345 | -907.798652 | -904.269223 | 0.254518 |
| SPIP-TIP | Frank | 19.999994 | -621.388576 | -617.859147 | 0.178558 |
| QQQ-XLK | Gumbel | 6.008995 | -693.107085 | -689.577656 | 0.49749 |
| AGG-IEF | Gumbel | 5.387482 | -599.444074 | -595.914645 | 0.353594 |
| GDX-GDXJ | Frank | 18.964843 | -564.082877 | -560.553448 | 0.301371 |
| HYG-JNK | Gumbel | 4.75396 | -575.195083 | -571.665654 | 0.480766 |
| DIA-XLI | Gumbel | 4.265284 | -526.302833 | -522.773404 | 0.659108 |
| SPAB-SPTL | Frank | 13.023785 | -412.03326 | -408.503831 | 0.123467 |
| LQD-SPIB | Gumbel | 2.692679 | -294.567726 | -291.038297 | 0.230292 |
| CWB-XLY | Gumbel | 2.559714 | -277.171703 | -273.642274 | 0.204521 |
| MBB-SPTI | Frank | 7.72779 | -231.767919 | -228.23849 | 0.163916 |
| XLB-XLE | Gumbel | 2.10329 | -209.080794 | -205.551364 | 0.560357 |
| XLP-XLV | Gumbel | 2.156211 | -209.652546 | -206.123117 | 0.496145 |
| MUB-TYD | Frank | 4.324432 | -98.648962 | -95.119533 | 0.132821 |
| SPBO-SPTS | Gumbel | 1.413525 | -45.293663 | -41.764234 | 0.398201 |
| BKLN-USO | Frank | 2.360884 | -33.521638 | -29.992208 | 0.092707 |
| SPMB-SPSB | Clayton | 0.432169 | -13.563769 | -10.03434 | 0.079563 |
| SPHY-XLU | Gumbel | 1.109499 | -3.798345 | -0.268915 | 0.384843 |
# keep only pairs whose fitted copula passed the 2-D KS test
# (p > 0.1; could relax the cutoff to 0.05)
passed = copulas_df['KS_pvalue'] > 0.1
new_copula_df = copulas_df[passed].sort_values("KS_pvalue", ascending=False)
selected_pairs = new_copula_df.index
selected_pairs
Index(['TLT-TMF', 'DIA-XLI', 'XLB-XLE', 'QQQ-XLK', 'XLP-XLV', 'HYG-JNK',
'SPBO-SPTS', 'SPHY-XLU', 'AGG-IEF', 'SPXL-^GSPC', 'GDX-GDXJ', 'KBE-KRE',
'LQD-SPIB', 'CWB-XLY', 'SPIP-TIP', 'MBB-SPTI', 'MUB-TYD', 'SPAB-SPTL'],
dtype='object')
### Backtest each surviving pair on the test window: refit the t marginals on the
### training returns, transform to uniforms, pick the lowest-AIC copula, then
### trade the test window whenever the conditional probabilities leave the bands.
algo_returns = {}
cl = 0.85  # confidence bands for opening a position
history = []  # per-pair DataFrames of test-window conditional probabilities
for pair in selected_pairs:
    s1, s2 = parse_pair(pair)
    params_s1 = stats.t.fit(returns_train[s1])  # fit marginals
    dist_s1 = stats.t(*params_s1)
    params_s2 = stats.t.fit(returns_train[s2])  # fit marginals
    dist_s2 = stats.t(*params_s2)
    u = dist_s1.cdf(returns_train[s1])  # probability integral transform
    v = dist_s2.cdf(returns_train[s2])  # probability integral transform
    best_aic = np.inf  # fit copula
    best_copula = None  # could incorporate a mixture of copulas explaining each relationship
    ### could take out the gaussian copula because of its problems in the 08 GFC
    # GaussianCopula()
    # renamed from `copulas` -- that local previously shadowed the copulas() function above
    candidate_copulas = [ClaytonCopula(), GumbelCopula(), FrankCopula(), JoeCopula()]
    for copula in candidate_copulas:
        copula.fit(u, v)
        L = copula.log_likelihood(u, v)
        aic = 2 * copula.num_params - 2 * L
        if aic < best_aic:  # choose the copula with the lowest AIC
            best_aic = aic
            best_copula = copula  # best copula to fit the joint distribution using AIC
    # conditional probabilities on the TEST window (loop variables renamed so they
    # do not clobber the training-window u, v above)
    prob_s1 = []
    prob_s2 = []
    for ut, vt in zip(dist_s1.cdf(returns_test[s1]), dist_s2.cdf(returns_test[s2])):
        prob_s1.append(best_copula.cdf_u_given_v(ut, vt))
        prob_s2.append(best_copula.cdf_v_given_u(ut, vt))
    probs_trade = pd.DataFrame(np.vstack([prob_s1, prob_s2]).T, index=returns_test.index, columns=[s1, s2])
    history.append(probs_trade)
    positions = pd.DataFrame(index=probs_trade.index, columns=probs_trade.columns)
    in_long = False
    in_short = False
    for t in positions.index:
        # if a long position is open: hold until the signal mean-reverts past 0.5
        if in_long:
            if (probs_trade.loc[t][s1] > 0.5) or (probs_trade.loc[t][s2] < 0.5):
                positions.loc[t] = [0, 0]
                in_long = False
            else:
                positions.loc[t] = [1, -1]
        # if a short position is open
        elif in_short:
            if (probs_trade.loc[t][s1] < 0.5) or (probs_trade.loc[t][s2] > 0.5):
                positions.loc[t] = [0, 0]
                in_short = False
            else:
                positions.loc[t] = [-1, 1]
        # if no position is open: enter only when BOTH legs breach the bands
        else:
            if (probs_trade.loc[t][s1] < (1 - cl)) and (probs_trade.loc[t][s2] > cl):
                positions.loc[t] = [1, -1]  # open long s1 / short s2
                in_long = True
            elif (probs_trade.loc[t][s1] > cl) and (probs_trade.loc[t][s2] < (1 - cl)):
                positions.loc[t] = [-1, 1]  # open short s1 / long s2
                in_short = True
            else:
                positions.loc[t] = [0, 0]
    # positions are shifted a day (we trade at the close AFTER the signal);
    # the two legs' log-returns are summed into one daily strategy return
    algo_ret = (returns_test * positions.shift()).sum(axis=1)
    algo_returns[pair] = algo_ret
# dump every pair's test-window conditional-probability frame
for probs_frame in history:
    print(probs_frame)
TLT TMF
Date
2016-01-04 0.000139 0.999840
2016-01-05 0.498514 0.493757
2016-01-06 0.556754 0.478497
2016-01-07 0.454763 0.559304
2016-01-08 0.549488 0.472055
... ... ...
2016-12-23 0.722734 0.288077
2016-12-27 0.272835 0.726411
2016-12-28 0.807231 0.211313
2016-12-29 0.192905 0.820350
2016-12-30 0.162074 0.845676
[252 rows x 2 columns]
DIA XLI
Date
2016-01-04 0.167538 0.596566
2016-01-05 0.238922 0.808807
2016-01-06 0.357686 0.359901
2016-01-07 0.366034 0.169416
2016-01-08 0.305240 0.524729
... ... ...
2016-12-23 0.411516 0.638777
2016-12-27 0.325333 0.728427
2016-12-28 0.781337 0.134598
2016-12-29 0.446698 0.584672
2016-12-30 0.556438 0.429441
[252 rows x 2 columns]
XLB XLE
Date
2016-01-04 0.032367 0.905688
2016-01-05 0.354001 0.748423
2016-01-06 0.157770 0.110175
2016-01-07 0.060303 0.380104
2016-01-08 0.321217 0.404465
... ... ...
2016-12-23 0.669966 0.429927
2016-12-27 0.758987 0.403492
2016-12-28 0.284033 0.480577
2016-12-29 0.583170 0.478075
2016-12-30 0.250572 0.691505
[252 rows x 2 columns]
QQQ XLK
Date
2016-01-04 0.037722 0.882561
2016-01-05 0.587852 0.403870
2016-01-06 0.671561 0.217379
2016-01-07 0.290255 0.355758
2016-01-08 0.365453 0.548820
... ... ...
2016-12-23 0.408004 0.624373
2016-12-27 0.724306 0.337771
2016-12-28 0.539670 0.363746
2016-12-29 0.210003 0.806755
2016-12-30 0.161425 0.762378
[252 rows x 2 columns]
XLP XLV
Date
2016-01-04 0.270278 0.188756
2016-01-05 0.813373 0.373220
2016-01-06 0.595031 0.245856
2016-01-07 0.344326 0.126649
2016-01-08 0.488121 0.154803
... ... ...
2016-12-23 0.311580 0.831490
2016-12-27 0.426498 0.643879
2016-12-28 0.381481 0.379991
2016-12-29 0.812441 0.312016
2016-12-30 0.315767 0.557216
[252 rows x 2 columns]
HYG JNK
Date
2016-01-04 0.262892 0.527599
2016-01-05 0.258283 0.825426
2016-01-06 0.815556 0.207479
2016-01-07 0.227218 0.551922
2016-01-08 0.765432 0.161336
... ... ...
2016-12-23 0.146533 0.911983
2016-12-27 0.910368 0.072783
2016-12-28 0.415803 0.581778
2016-12-29 0.363736 0.720795
2016-12-30 0.092637 0.939559
[252 rows x 2 columns]
SPBO SPTS
Date
2016-01-04 0.982201 0.164563
2016-01-05 0.077662 0.483582
2016-01-06 0.832030 0.708076
2016-01-07 0.209025 0.882466
2016-01-08 0.466248 0.741131
... ... ...
2016-12-23 0.334045 0.363705
2016-12-27 0.403939 0.563553
2016-12-28 0.865186 0.358865
2016-12-29 0.558391 0.880689
2016-12-30 0.580391 0.767062
[252 rows x 2 columns]
SPHY XLU
Date
2016-01-04 0.973232 0.284925
2016-01-05 0.053767 0.821957
2016-01-06 0.742877 0.381189
2016-01-07 0.758185 0.209802
2016-01-08 0.501752 0.478717
... ... ...
2016-12-23 0.936663 0.380766
2016-12-27 0.782963 0.483526
2016-12-28 0.467642 0.162538
2016-12-29 0.396947 0.921549
2016-12-30 0.805437 0.240099
[252 rows x 2 columns]
AGG IEF
Date
2016-01-04 0.000220 0.999940
2016-01-05 0.847211 0.180482
2016-01-06 0.742697 0.370045
2016-01-07 0.013554 0.991153
2016-01-08 0.919934 0.132439
... ... ...
2016-12-23 0.254831 0.788551
2016-12-27 0.440686 0.543083
2016-12-28 0.198000 0.871293
2016-12-29 0.978527 0.043157
2016-12-30 0.754276 0.334097
[252 rows x 2 columns]
SPXL ^GSPC
Date
2016-01-04 0.566210 0.366002
2016-01-05 0.648482 0.374914
2016-01-06 0.327790 0.615203
2016-01-07 0.329700 0.576472
2016-01-08 0.338062 0.622151
... ... ...
2016-12-23 0.249566 0.759801
2016-12-27 0.606912 0.410120
2016-12-28 0.688278 0.287210
2016-12-29 0.283817 0.723907
2016-12-30 0.909136 0.087634
[252 rows x 2 columns]
GDX GDXJ
Date
2016-01-04 0.446633 0.581536
2016-01-05 0.131133 0.867941
2016-01-06 0.534869 0.469571
2016-01-07 0.958817 0.063141
2016-01-08 0.627413 0.359751
... ... ...
2016-12-23 0.588073 0.412453
2016-12-27 0.302559 0.736770
2016-12-28 0.109722 0.903831
2016-12-29 0.864757 0.955641
2016-12-30 0.713513 0.109576
[252 rows x 2 columns]
KBE KRE
Date
2016-01-04 0.543101 0.288144
2016-01-05 0.329151 0.684177
2016-01-06 0.133592 0.810731
2016-01-07 0.325813 0.449620
2016-01-08 0.378268 0.482930
... ... ...
2016-12-23 0.594045 0.429849
2016-12-27 0.152916 0.875860
2016-12-28 0.714662 0.242976
2016-12-29 0.413232 0.553474
2016-12-30 0.176862 0.846263
[252 rows x 2 columns]
LQD SPIB
Date
2016-01-04 0.015712 0.996347
2016-01-05 0.139375 0.932670
2016-01-06 0.744309 0.466668
2016-01-07 0.460971 0.649756
2016-01-08 0.341869 0.771068
... ... ...
2016-12-23 0.642674 0.486539
2016-12-27 0.366097 0.566938
2016-12-28 0.436188 0.782848
2016-12-29 0.887569 0.262805
2016-12-30 0.148570 0.940116
[252 rows x 2 columns]
CWB XLY
Date
2016-01-04 0.483766 0.121816
2016-01-05 0.860853 0.183776
2016-01-06 0.573868 0.205053
2016-01-07 0.288409 0.152153
2016-01-08 0.360738 0.308401
... ... ...
2016-12-23 0.989979 0.027021
2016-12-27 0.974290 0.089531
2016-12-28 0.113829 0.653353
2016-12-29 0.889807 0.157514
2016-12-30 0.482725 0.296585
[252 rows x 2 columns]
SPIP TIP
Date
2016-01-04 0.413764 0.625344
2016-01-05 0.735042 0.268918
2016-01-06 0.504450 0.517494
2016-01-07 0.630434 0.370276
2016-01-08 0.554878 0.442913
... ... ...
2016-12-23 0.969487 0.028385
2016-12-27 0.081850 0.918370
2016-12-28 0.584895 0.423728
2016-12-29 0.161154 0.838859
2016-12-30 0.454774 0.566649
[252 rows x 2 columns]
MBB SPTI
Date
2016-01-04 0.977278 0.005634
2016-01-05 0.718009 0.296284
2016-01-06 0.875971 0.581059
2016-01-07 0.505277 0.597364
2016-01-08 0.587366 0.470657
... ... ...
2016-12-23 0.506366 0.504158
2016-12-27 0.692244 0.242352
2016-12-28 0.615222 0.670542
2016-12-29 0.940646 0.261214
2016-12-30 0.622969 0.564870
[252 rows x 2 columns]
MUB TYD
Date
2016-01-04 0.007194 0.998796
2016-01-05 0.830072 0.235688
2016-01-06 0.980671 0.114181
2016-01-07 0.863759 0.210812
2016-01-08 0.642631 0.383883
... ... ...
2016-12-23 0.837998 0.198215
2016-12-27 0.564727 0.340332
2016-12-28 0.933311 0.172574
2016-12-29 0.483015 0.988969
2016-12-30 0.682376 0.077697
[252 rows x 2 columns]
SPAB SPTL
Date
2016-01-04 0.230168 0.776569
2016-01-05 0.948228 0.049650
2016-01-06 0.707333 0.529760
2016-01-07 0.044088 0.957673
2016-01-08 0.678943 0.340254
... ... ...
2016-12-23 0.794868 0.206137
2016-12-27 0.662520 0.332517
2016-12-28 0.225548 0.778883
2016-12-29 0.967456 0.044619
2016-12-30 0.702897 0.295207
[252 rows x 2 columns]
def long_only(history_df, probs, prices=None):
    """
    Attach prices to a pair's conditional-probability history and flag long-entry days.

    Inputs: history_df -- DataFrame of conditional probabilities, one column per leg;
            probs -- threshold at or below which a leg is flagged as a long entry;
            prices -- optional wide price DataFrame (defaults to the module-level `df`).
    Outputs: DataFrame with '<ticker>_prices' columns, the raw probabilities, and
             '<ticker> Long' 0/1 entry flags (generalized from the hard-coded
             check.columns[2]/[3] two-leg assumption).
    """
    source = df if prices is None else prices
    # .copy() so the column rename does not hit a SettingWithCopy warning on the slice
    new = source[history_df.columns].copy()
    # explicit per-column rename instead of relying on `Index + ["_prices"]`
    # broadcasting a length-1 list, which is fragile across pandas versions
    new.columns = [f"{col}_prices" for col in history_df.columns]
    check = pd.concat([new, history_df], axis=1).dropna()
    for col in history_df.columns:
        check[f'{col} Long'] = np.where(check[col] <= probs, 1, 0)
    return check
# history[6] is the SPBO-SPTS pair; flag days where either conditional probability <= 0.1
long_only(history[6], 0.1)
| SPBO_prices | SPTS_prices | SPBO | SPTS | SPBO Long | SPTS Long | |
|---|---|---|---|---|---|---|
| Date | ||||||
| 2016-01-04 | 24.842628 | 27.543909 | 0.982201 | 0.164563 | 0 | 0 |
| 2016-01-05 | 24.667610 | 27.525602 | 0.077662 | 0.483582 | 1 | 0 |
| 2016-01-06 | 24.826712 | 27.580503 | 0.832030 | 0.708076 | 0 | 0 |
| 2016-01-07 | 24.794901 | 27.607965 | 0.209025 | 0.882466 | 0 | 0 |
| 2016-01-08 | 24.802851 | 27.626270 | 0.466248 | 0.741131 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... |
| 2016-12-23 | 25.778719 | 27.693068 | 0.334045 | 0.363705 | 0 | 0 |
| 2016-12-27 | 25.754118 | 27.693068 | 0.403939 | 0.563553 | 0 | 0 |
| 2016-12-28 | 25.829756 | 27.696762 | 0.865186 | 0.358865 | 0 | 0 |
| 2016-12-29 | 25.903746 | 27.752234 | 0.558391 | 0.880689 | 0 | 0 |
| 2016-12-30 | 25.944838 | 27.779961 | 0.580391 | 0.767062 | 0 | 0 |
252 rows × 6 columns
# aggregate per-pair daily log-returns into one equal-weight portfolio curve
returns = pd.DataFrame.from_dict(algo_returns)
returns = np.exp(returns) - 1  # convert log-returns to simple returns
n_pairs = len(returns.columns)
# the x2 reflects the extra capital made available by the short legs
total_ret = returns.sum(axis=1) / n_pairs * 2
strat = pd.DataFrame(total_ret + 1, columns=['Algo'])
strat['returns'] = np.nancumprod(total_ret + 1)  # cumulative growth factor
strat
| Algo | returns | |
|---|---|---|
| Date | ||
| 2016-01-04 | 1.000000 | 1.000000 |
| 2016-01-05 | 1.000595 | 1.000595 |
| 2016-01-06 | 1.002417 | 1.003013 |
| 2016-01-07 | 0.999864 | 1.002877 |
| 2016-01-08 | 0.999089 | 1.001963 |
| ... | ... | ... |
| 2016-12-23 | 1.000121 | 1.077592 |
| 2016-12-27 | 0.999687 | 1.077255 |
| 2016-12-28 | 1.000970 | 1.078299 |
| 2016-12-29 | 0.998197 | 1.076355 |
| 2016-12-30 | 1.000170 | 1.076537 |
252 rows × 2 columns
# benchmark the copula strategy against buy-and-hold S&P 500 over the same test window
benchmark = ["^GSPC"]
final = main.load_sql().set_index("Date")
end = strat.index[-1]
# daily simple returns of the benchmark, aligned to the strategy's first date
bm = main.dataframe(benchmark, final, "2000-01-01", end).pct_change().dropna()[strat.index[0]:]
# np.nancumprod flattens the (single-column) frame, which is fine here —
# would silently misbehave if `benchmark` ever held more than one ticker
bm["Benchmark"] = np.nancumprod(bm+1)
final = pd.concat([strat, bm],axis=1)
copula_strat = final[['returns','Benchmark']].reset_index()
# plot strategy vs benchmark cumulative growth
fig = px.line(copula_strat, x="Date", y=copula_strat.columns,
hover_data={"Date": "|%B %d, %Y"},
title='Strategy Returns')
fig.update_xaxes(
dtick="M1",
tickformat="%b\n%Y")
fig.show()
# plot the cumulative growth of each pair strategy individually
# (items() + comprehension replaces the .keys() + append loop)
ind_strat = [pd.DataFrame(series, columns=[pair_name])
             for pair_name, series in algo_returns.items()]
final_df = pd.concat(ind_strat, axis=1).fillna(0)
final_df_des = final_df.describe()  # per-pair summary stats, kept for inspection
# NOTE(review): final_df holds daily LOG-returns; (1 + r).cumprod() only
# approximates compounded growth — confirm whether exp(cumsum) was intended
cumprod_daily_pct_change = (1 + final_df).cumprod().reset_index()
fig = px.line(cumprod_daily_pct_change, x="Date", y=cumprod_daily_pct_change.columns,
              hover_data={"Date": "|%B %d, %Y"},
              title='Pair Returns')
fig.update_xaxes(
    dtick="M1",
    tickformat="%b\n%Y")
fig.show()
### good idea would be to match this with fundamental momentum strategy (roic)